# import libraries and define global settingsimport numpy as npimport pandas as pdimport seaborn as snsimport scipy.stats as statsimport matplotlib.pyplot as plt# The code below define global figure properties used for publication.# import matplotlib_inline.backend_inline# matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector formatplt.rcParams.update( {'font.size':14, # font size'savefig.dpi':300, # output resolution'axes.titlelocation':'left',# title location'axes.spines.right':False, # remove axis bounding box'axes.spines.top':False, # remove axis bounding box })# setting seednp.random.seed(10)
# Create normally distributed dataN =100data = np.random.randn(N)# and add two random outliers in random positionsdata[np.random.choice(np.arange(N),2)] = np.random.uniform(2,3,2)**2# and plotplt.figure(figsize=(8,4))plt.plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))plt.xlim([-2,N+1])plt.xlabel('Data index')plt.ylabel('Data value')plt.tight_layout()plt.show()
R version:
Code
# Create normally distributed dataN =100y =rnorm(N)data =tibble(data_index =1:length(y), data_value = y)# and add two random outliers in random positiionsdata[sample(N, 2), "data_value"] =runif(2, min=2, max=3)^2# and plot:p7.4=ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray",alpha=.8 ) +xlim(-2, N+1) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(y ="Data value", x ="Data index")p7.4
Figure 7.6: Z-score method for identifying outliers
Python version:
Code
# outlier thresholdzThreshold =3.09# create some raw dataN =135data = np.exp(np.random.randn(N)/2) +5# zscore the datadataZ = (data-np.mean(data)) / np.std(data,ddof=1)# identify data indices containing outliersoutliers = np.where(np.abs(dataZ)>zThreshold)[0]# and plot_,axs = plt.subplots(1,2,figsize=(10,4))axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))axs[0].set(xlim=[-2,N+1],xlabel='Data index',ylabel='Data value')axs[0].set_title(r'$\bf{A}$) Original data')axs[1].plot(dataZ,'ks',markersize=10,markerfacecolor=(.9,.9,.9))axs[1].axhline(zThreshold,linestyle='--',color=(.9,.9,.9))axs[1].plot(outliers,dataZ[outliers],'kx',markersize=10,markeredgewidth=2)axs[1].set(xlim=[-3,N+2],xlabel='Data index',ylabel='Transformed data value')axs[1].set_title(r'$\bf{B}$) Z-transformed data')plt.tight_layout()plt.show()
Figure 7.7: Impact of removing outliers on z-values
Python version:
Code
# create some raw dataN =10# sample sizedata = np.exp(np.random.randn(N)/2) +5data[-1] = np.max(data)+2# impose an outlier (at the end for convenience)xvals = np.arange(N)dataZ1 = (data-np.mean(data)) / np.std(data,ddof=1)dataZ2 = (data[:-1]-np.mean(data[:-1])) / np.std(data[:-1],ddof=1)_,axs = plt.subplots(1,2,figsize=(10,4))axs[0].plot(xvals,data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))axs[0].set(xticks=[],xlabel='Data index',ylabel='Raw data value')axs[0].set_title(r'$\bf{A}$) Raw data')axs[1].plot(xvals,dataZ1,'ks',markersize=10,markerfacecolor=(.7,.7,.7),label='Z with outlier')axs[1].plot(xvals[:-1],dataZ2,'ko',markersize=10,markerfacecolor=(.5,.5,.5),label='Z without outlier')axs[1].set(xticks=[],xlabel='Data index',ylabel='Transformed data value')axs[1].legend()axs[1].set_title(r'$\bf{B}$) Z-transformed data')# draw lines connection pre/post-removal valuesfor d,z,x inzip(dataZ1[:-1],dataZ2,xvals[:-1]): axs[1].plot([x,x],[d,z],':',color=(.7,.7,.7),zorder=-10)plt.tight_layout()plt.show()
R version:
Code
# create some raw dataN =10# sample sizedata =exp(rnorm(N)/2) +5data[N] =max(data) +2# impose an outlier (at the end for convenience)dataZ1 = (data -mean(data)) /sd(data) # sd uses n-1 dofdataZ2 = (data[-N] -mean(data[-N])) /sd(data[-N]) # sd uses n-1 dof# and plot:data =tibble(data_index =1:length(data), data_value = data)pA =ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray" ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.text.x=element_blank(),axis.ticks.x=element_blank(),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Raw data"),y ="Raw data value", x ="Data index")dataZ1 =tibble(data_index =1:length(dataZ1), data_value = dataZ1, id="Z with outlier")dataZ2 =tibble(data_index =1:length(dataZ2), data_value = dataZ2, id="Z without outlier")dataZ =rows_append(dataZ1, dataZ2)pB =ggplot(dataZ, aes(x = data_index, y=data_value, group = data_index,shape=id,fill=id)) +geom_line(linetype="dashed", linewidth=.3 ) +geom_point(size=5 ) +scale_shape_manual(values=c("Z with outlier"=22,"Z without outlier"=21)) +scale_fill_manual(values=c("Z with outlier"="#bdbdbd","Z without outlier"="#525252")) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.text.x=element_blank(),axis.ticks.x=element_blank(),legend.position =c(.8,.9),legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(shape="", fill="",title =bquote(bold("B)")~"Z-transformed data"),y ="Transformed data value", x ="Data index")p7.7= pA + pBp7.7
Figure 7.9: Data trimming
Python version:
Code
N =74data = np.random.randn(N)**3# find largest and smallest valuesk =2sortidx = np.argsort(data)minvals = sortidx[:k]maxvals = sortidx[-k:]_,axs = plt.subplots(2,1,figsize=(8,6))axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.9,.9,.9))axs[0].plot(minvals,data[minvals],'kx',markersize=10,markeredgewidth=2)axs[0].plot(maxvals,data[maxvals],'kx',markersize=10,markeredgewidth=2)axs[0].set_title(r'$\bf{A}$) Data with k-extreme points trimmed')# create a Gaussian probability curve for the panel Bx = np.linspace(-4,4,401)gpdf = stats.norm.pdf(x)# the find the indices of the 2.5% and 97.5%lbndi = np.argmin(np.abs(x-stats.norm.ppf(.025))) # lbndi = Lower BouND Indexubndi = np.argmin(np.abs(x-stats.norm.ppf(1-.025)))# plot the probability function and the vertical linesaxs[1].plot(x,gpdf,'k',linewidth=2)axs[1].axvline(x[lbndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')axs[1].axvline(x[ubndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')axs[1].set(xlim=x[[0,-1]],ylim=[0,.42])axs[1].set_title(r'$\bf{B}$) Histogram showing trimmed areas')# now create patches for the rejected areaaxs[1].fill_between(x[:lbndi+1],gpdf[:lbndi+1],color='k',alpha=.4)axs[1].fill_between(x[ubndi:],gpdf[ubndi:],color='k',alpha=.4)# and saveplt.tight_layout()plt.show()
R Version:
Code
N =74y =rnorm(N)^3data =tibble(data_index=1:length(y), data_value=y)# find largest and smallest valuesk =2# sorting ascending order:data =arrange(data, data_value)largest =head(data, k)smallest =tail(data, k)pA =ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray" ) +geom_point(data = largest,aes(x = data_index, y = data_value),shape=7,size=5,stroke=1,fill ="gray",alpha=.8 ) +geom_point(data = smallest,aes(x = data_index, y = data_value),shape=7,size=5,stroke=1,fill ="gray",alpha=.8 ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Data with k-extreme points trimmed"),y ="", x ="")# create a Gaussian probability curve for the panel Bx =seq(-4,4,length=401)gpdf =dnorm(x, mean =0, sd =1)data =tibble(data_index = x, data_value = gpdf)# find the indices of the 2.5% and 97.5%lbndi =which.min(abs(x -qnorm(0.025))) # lower bound indexubndi =which.min(abs(x -qnorm(0.975))) # upper bound index# plot the probability function and the vertical linespB =ggplot(data = data, aes(x = data_index, y = data_value)) +geom_line() +geom_vline(xintercept = x[lbndi], linetype="dashed") +geom_ribbon(data =filter(data, data_index <= x[lbndi]), aes(ymax = data_value, ymin=0),fill="#969696") +geom_vline(xintercept = x[ubndi], linetype="dashed") +geom_ribbon(data =filter(data, data_index >= x[ubndi]), aes(ymax = data_value, ymin=0),fill="#969696") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("B)")~"Histogram showing trimmed areas"),y ="", x ="")p7.9= pA + pB +plot_layout(ncol=1)p7.9
Exercise 1
Python version:
Code
## iterative method# Note about this code: Because of random numbers, you are not guaranteed to get a result# that highlights the method. Try running the code several times.N =30data = np.random.randn(N)data[data<-1] = data[data<-1]+2data[data>1.5] = data[data>1.5]**2;# try to force a few outliers# pick a lenient threshold just for illustrationzscorethresh =2dataZ = (data-np.mean(data)) / np.std(data,ddof=1)plt.figure(figsize=(10,4))colorz ='brkmc'numiters =0# iteration counterwhileTrue:# convert to z datamean = np.nanmean(dataZ) datastd = np.nanstd(dataZ,ddof=1) dataZ = (dataZ-datamean) / datastd# find data values to remove toremove = dataZ>zscorethresh# break out of while loop if no points to removeifsum(toremove)==0:breakelse:# otherwise, mark the outliers in the plot plt.plot(np.where(toremove)[0]+numiters/5,dataZ[toremove],'%sx'%colorz[numiters], markersize=12,markeredgewidth=3) dataZ[toremove] = np.nan# replot plt.plot(np.arange(N)+numiters/5,dataZ,linestyle='None',marker=f'${numiters}$',markersize=12, color=colorz[numiters])# update counter numiters = numiters +1plt.ylabel('Z-score')plt.xlabel('Data index')plt.tight_layout()plt.show()
R version:
Some notes: if I understand correctly, you want to simulate some data, add some outliers and calculate the z-scores and removing them to see the behavior of the dataset.
In each iteration, you want to find all outliers, plot the all the datapoints, and mark the outliers to remove them in the next round.
If that’s the case, I take the liberty to reorganize the code and do the following:
Commented the part of the original data does not get z-scored twice in the first iteration;
Reorganized the replot section in the for-loop so that the data points are plotted even when no outlier is found. This will prevent the case of no-plotting when no outlier is found in the first iteration.
Please, let me know if these changes make sense. In any case, I can follow strictly to the python code.
Code
## iterative method# Note about this code: Because of random numbers, you are not guaranteed to get a result# that highlights the method. Try running the code several times.# Simulate the data:N =30data =rnorm(N)data[data <-1] = data[data <-1] +2data[data >1.5] = data[ data >1.5]^2# pick a lenient threshold just for illustrationzscorethresh =2# ======================================================= # # if this part is included, then the first iteration will be # z-scored two times.# dataZ = (data-mean(data)) / sd(data)# color pallete:colorz =c("blue", "black", "red", "magenta", "cyan")# start the skeleton of the plot:p =ggplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(y ="Z-score", x ="Data index")# initiate counter:numiters =1# initiate the dataZ with original data:dataZ = datawhile(TRUE) {# convert to z-score: datamean =mean(dataZ, na.rm = T) datastd =sd(dataZ, na.rm = T) dataZ = (dataZ - datamean)/datastd# find data values to remoove: toRemove = dataZ > zscorethresh# plot points in the current iteration: nmb = numiters color_ = colorz[nmb] data_temp =tibble(data_index =1:N + numiters/5, data_value = dataZ) p <- p +geom_text(data = data_temp,aes(x = data_index, y = data_value),label = nmb,size =5,color = color_ )# break out of the while loop if no points to remove:if (sum(toRemove, na.rm = T) ==0) {break } else{# otherwise, mark the outliers in the plot for the current iteration: outlier_data =tibble(data_index =which(toRemove) + numiters/5,data_value = dataZ[toRemove] ) color_ = colorz[numiters] p <- p +geom_point(data = outlier_data,aes(x=data_index, y=data_value),shape =4,color = color_,size=2,stroke=1 ) dataZ[toRemove] =NaN }# update counter: numiters <- numiters +1}p
Exercise 2
Python version
Code
# create dataN =10000Y = np.exp(np.sin(np.random.randn(N)))# make a copy of the data to manipulateYc = Y.copy()# not specified in the instructions, but always a good idea to inspect the data!plt.hist(Y,bins=40)plt.show()
Code
# percent to remove (two-tailed)k =4# convert that to a number of data points to remove from each tailpnts2nan =int( (k/2)/100* N ) # with stated parameters, this should be 200# find the data sortingsort_idx = np.argsort(Y)# nan the two tails separatelyYc[sort_idx[:pnts2nan]] = np.nanYc[sort_idx[-pnts2nan:]] = np.nan# confirm the right numbers of pointsprint(f'Total dataset size: {len(Yc)}')print(f'Valid dataset size: {np.sum(~np.isnan(Yc))}')
Code
# compute the mean and median (also used in the next exercise)meanY = np.mean(Y)medianY = np.median(Y)# print the meansprint(f'Mean of original: {meanY:.3f}')print(f'Mean of trimmed: {np.nanmean(Yc):.3f}')# print the mediansprint(' ')print(f'Median of original: {medianY:.3f}')print(f'Median of trimmed: {np.nanmedian(Yc):.3f}')
R version:
Code
# create dataN =10000Y =exp(sin(rnorm(N)))# make a copy of the data to manipulate:data =tibble(data_index =1:length(Y), data_value = Y)ggplot(data, aes(x = data_value)) +geom_histogram(bins =40, fill="#2c7fb8", color="white") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(y ="", x ="")
Code
# percent to remove (two-tailed)k =4# finding threshold in the distribution:q = k/2# percent from each sidethresh =quantile(data$data_value, c(q/100, 1-q/100))# data with two-tailed points removed:data_twoTailed_rmvd = data %>%filter(data_value > thresh[1] & data_value < thresh[2])# confirm the right number of points:print(glue("Total dataset size: {nrow(data)}"))
# compute the mean and median (also used in the next exercise)mean_Y =mean(data$data_value)median_Y =median(data$data_value)mean_Ytrimmed =mean(data_twoTailed_rmvd$data_value)median_Ytrimmed =median(data_twoTailed_rmvd$data_value)# print the meansprint(glue("Mean of original: {sprintf('%.3f', mean_Y)}"))
Mean of original: 1,223
Code
print(glue("Mean of trimmed: {sprintf('%.3f', mean_Ytrimmed)}"))
Mean of trimmed: 1,210
Code
# print the mediansprint(glue("Median of original: {sprintf('%.3f', median_Y)}"))
Median of original: 0,987
Code
print(glue("Median of trimmed: {sprintf('%.3f', median_Ytrimmed)}"))
Median of trimmed: 0,987
Exercise 3
Python version:
Code
# the range of k valuesks = np.arange(1,50,3)# initialize a results matrix for mean/medianresults = np.zeros((len(ks),2))# the experiment!for idx,ki inenumerate(ks):# make a new copy of the original data Yc = Y.copy() # Note: Y was defined in Exercise 2# convert that to a number of data points to remove from each tail pnts2nan =int( (ki/2)/100* N )# nan the two tails separately Yc[sort_idx[:pnts2nan]] = np.nan Yc[sort_idx[-pnts2nan:]] = np.nan# collect mean and median results[idx,0] =100*(np.nanmean(Yc)-meanY) / meanY results[idx,1] =100*(np.nanmedian(Yc)-medianY) / medianYprint(f'Total/valid dataset size: {len(Yc)} -> {np.sum(~np.isnan(Yc))}')
Code
# plot the resultsplt.figure(figsize=(8,4))plt.plot(ks,results[:,0],'s-',color=[.6,.6,.6],markerfacecolor=[.8,.8,.8],markersize=10,label='Mean')plt.plot(ks,results[:,1],'o-',color='k',markerfacecolor=[.4,.4,.4],markersize=10,label='Median')plt.legend()plt.xlabel('k% to trim')plt.ylabel(r'Descriptive value (%$\Delta$)')plt.tight_layout()plt.show()
R version
Code
# the range of k valuesks =seq(1, 50, 3)# initialize a results matrix for mean/medianresults =matrix(0, length(ks), 2)# the experiment!for (idx inseq_along(ks)) {#idx = 1# convert that to a number of data points to remove from each tail: q = ks[idx]/2# get the thresholds: thresh =quantile(data$data_value, c(q/100, 1-q/100))# get the remaining data points: data_temp = data %>%filter(data_value > thresh[1] & data_value < thresh[2])# collect mean and median and calculate % difference from original ones: results[idx, 1] =100*(mean(data_temp$data_value) -mean(data$data_value)) /mean(data$data_value) results[idx, 2] =100*(median(data_temp$data_value) -median(data$data_value)) /median(data$data_value)print(glue("Total/valid dataset size: {nrow(data)} -> {nrow(data_temp)}"))}
ggplot(tibble(data_index = ks, mean = results[,1], median = results[,2])) +geom_point(aes(x = data_index, y = mean, color ="Mean",fill ="Mean",shape ="Mean" ),size=4 ) +geom_line(aes(x = data_index, y = mean, color ="Mean")) +geom_point(aes(x = data_index, y = median,color ="Median", fill ="Median", shape ="Median"),size=4 ) +geom_line(aes(x = data_index, y = median, color="Median")) +scale_shape_manual(values =c("Mean"=22,"Median"=21) ) +scale_color_manual(values =c("Mean"="#bdbdbd","Median"="#525252") ) +scale_fill_manual(values =c("Mean"="#bdbdbd","Median"="#525252") ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position =c(.1,.1),legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", y =expression(paste("Descriptive value (%",Delta,")")), x ="k% to trim")
Exercise 4
Python version:
Code
# create dataN =1000X = np.random.f(5,100,size=N)# zscore dataXz = (X-np.mean(X)) / np.std(X,ddof=1)zThresh =3# clean dataXclean = X[Xz<zThresh]# report number of removed data pointsprint(f'Original sample size: {N}')print(f'Cleaned sample size: {len(Xclean)}')print(f'Percent data removed: {100*(1-len(Xclean)/N):.2f}%')
Code
# histogram bins using FD ruley_fd_all = np.histogram(X,bins='fd')y_fd_clean = np.histogram(Xclean,bins='fd')# histogram bins using set boundaries# Note that I create the bin boundaries using k+1 numbers, then input that vector of boundaries into np.histogramedges = np.linspace(np.min(X),np.max(X),41)y_40_all = np.histogram(X,bins=edges)y_40_clean = np.histogram(Xclean,bins=edges)# plotting the histograms_,axs = plt.subplots(2,1,figsize=(8,7))axs[0].plot((y_fd_all[1][:-1]+y_fd_all[1][1:])/2,y_fd_all[0],'ks-', label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))axs[0].plot((y_fd_clean[1][:-1]+y_fd_clean[1][1:])/2,y_fd_clean[0],'ko-', label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))axs[0].set_title(r'$\bf{A}$) Histograms using F-D rule')axs[1].plot((y_40_all[1][:-1]+y_40_all[1][1:])/2,y_40_all[0],'ks-', label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))axs[1].plot((y_40_clean[1][:-1]+y_40_clean[1][1:])/2,y_40_clean[0],'ko-', label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))axs[1].set_title(r'$\bf{B}$) Histograms using 40 bins')# axis adjustmentsfor a in axs: a.legend() a.set(xlabel='F value',ylabel='Count', xlim=[np.min(X)-.02,np.max(X)+.02],xticks=range(6))plt.tight_layout()plt.savefig('dataQC_ex4.png')plt.show()
R Version
Code
N =1000x =rf(df1 =5, df2 =100, n = N)# zscore dataxZ = (x -mean(x)) /sd(x) # sd uses n-1 dofzThresh =3# clean dataxClean = x[xZ < zThresh]# report number of removed data points:print(glue("Original sample size: {N}"))
# boxplots of raw dataplt.figure(figsize=(10,5))sns.boxplot(data=df).set(xlabel='Data feature',ylabel='Data value')plt.show()
Code
# make a copy of the original data matrixdf_z = df.copy()for col in df_z.columns:ifnot (col=='sex'): df_z[col] = (df[col] - df[col].mean()) / df[col].std(ddof=1)# inspect againdf_z
Code
# box plots of z-scored dataplt.figure(figsize=(10,5))sns.boxplot(data=df_z).set(xlabel='Data feature',ylabel='Data value')plt.show()
Code
# Note: this cell combines the previous graphs to make one figure for the book_,axs = plt.subplots(2,1,figsize=(10,7))sns.boxplot(data=df, ax=axs[0]).set(xticks=[],ylabel='Data value')axs[0].set_title(r'$\bf{A}$) Raw data')sns.boxplot(data=df_z,ax=axs[1]).set(xlabel='Data feature',ylabel='Transformed data value')axs[1].set_title(r'$\bf{B}$) Z-transformed data')plt.tight_layout()plt.show()
# scale the data:cols_ =setdiff(names(data), "sex") # all columns, but 'sex' featuredataZ = data %>%mutate(across(all_of(cols_), function(x) (x -mean(x))/sd(x)))# inspecting againhead(dataZ)
# compute percent changepctchange =100*(cleaned_means-raw_means) / raw_means# and plotplt.figure(figsize=(9,4))plt.plot(pctchange,'ks',markersize=14,markerfacecolor=(.7,.7,.7))plt.axhline(0,color='k',linewidth=2,zorder=-1)plt.xticks(range(9),labels=df.columns)plt.ylabel('Percent')plt.title('Change in feature means after z-score data rejection',loc='center')plt.grid()plt.tight_layout()plt.savefig('dataQC_ex6b.png')plt.show()
R version :
Code
# remove based on z-score thresholdzThresh =3.09# p<.001# creating copy:data_clean = data# loop over all columnsfor (col_ incolnames(dataZ)) { data_clean[[col_]][dataZ[[col_]] > zThresh] =NA data_clean[[col_]][dataZ[[col_]] <-zThresh] =NA}
Code
pA =ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title ="",y ="Data value", x ="")pB =ggplot(stack(data_clean), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title ="",y ="Data value", x ="Data features")pA + pB +plot_layout(ncol=1)
# compute percent changepctchange =100*(clean_means - raw_means) / raw_meansggplot(data=stack(pctchange), aes(x = ind, y = values)) +geom_hline(yintercept =0,color="black" ) +geom_point(shape =22,fill ="#969696",size=6 ) +theme_bw() +theme(legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15),plot.title =element_text(hjust = .5) ) +labs(title ="Change in feature means after z-score data rejection",y ="Percent", x ="")
Source Code
---title: "Python to R translation"author: "Cleyton Farias"format: html: code-fold: true code-tools: true self-contained: trueengine: knitrexecute: cache: true fig-width: 6.4 fig-height: 4.8 warning: false messages: false---```{python}#| label: python-setup#| code-summary: Python Setup# import libraries and define global settingsimport numpy as npimport pandas as pdimport seaborn as snsimport scipy.stats as statsimport matplotlib.pyplot as plt# The code below define global figure properties used for publication.# import matplotlib_inline.backend_inline# matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector formatplt.rcParams.update( {'font.size':14, # font size'savefig.dpi':300, # output resolution'axes.titlelocation':'left',# title location'axes.spines.right':False, # remove axis bounding box'axes.spines.top':False, # remove axis bounding box })# setting seednp.random.seed(10)``````{r}#| label: r-setup#| code-summary: R Setup#| message: false#| warning: false# Attaching packageslibrary(dplyr)library(glue)library(stringr)library(ggplot2)library(patchwork)# setting seedset.seed(10)```# Figure 7.2: What to look for in visual inspection of data## Python version:```{python}_,axs = plt.subplots(2,2,figsize=(12,6))# panel A: unexpected rangexA = np.concatenate((np.random.randn(20),np.random.randn(80)*30),axis=0)axs[0,0].plot(xA,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.8)axs[0,0].set(xlabel='Data index',xticks=[],yticks=[],ylabel='Data value')axs[0,0].set_title(r'$\bf{A}$) Unexpected data range')# panel B: distribution shapexB = np.concatenate((5+np.random.randn(150),np.exp(1+np.random.randn(150))),axis=0)axs[0,1].hist(xB, bins='fd',edgecolor='k',facecolor=(.7,.7,.7))axs[0,1].set(xlabel='Data value',xticks=[],yticks=[],ylabel='Count')axs[0,1].set_title(r'$\bf{B}$) Nonstandard distribution')# panel C: mixed datasetsxC = np.concatenate((4+np.random.randn(150),np.random.randn(150)-4),axis=0)axs[1,0].hist(xC,bins=50,edgecolor='k',facecolor=(.7,.7,.7))axs[1,0].set(xlabel='Data value',xticks=[],yticks=[],ylabel='Count')axs[1,0].set_title(r'$\bf{C}$) Mixed dataset')# panel D: outliersxD = np.random.randn(150)xD[60] =10xD[84] =14axs[1,1].plot(xD,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.8)axs[1,1].set(xlabel='Data index',xticks=[],yticks=[],ylabel='Data value')axs[1,1].set_title(r'$\bf{B}$) Outliers')# exportplt.tight_layout()plt.savefig('dataQC_qualInspection.png')plt.show()```## R version:```{r}#| message: false#| fig-width: 12#| fig-height: 6# panel A: unexpected rangexA=c(rnorm(20), rnorm(80)*80)data_A =tibble(data_index=1:length(xA), data_value = xA)pA =ggplot(data_A, aes(x=data_index, y = data_value)) +geom_point(shape=22,size=5,fill ="gray",alpha=.8 ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.ticks.x =element_blank(),axis.text.x =element_blank(),axis.title =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Unexpected data range"),y ="Data value", x ="Data index")# panel B: distribution shapexB =c(5+rnorm(150), exp(1+rnorm(150)))data_B =tibble(data_index=1:length(xB), data_value=xB)pB =ggplot(data_B, aes(x=data_value))+geom_histogram(color="black", fill="gray", bins=50) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.ticks.x =element_blank(),axis.text.x =element_blank(),axis.title =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("B)")~"Nonstandard distribution"),y ="Count", x ="Data value")# panel C: mixed datasetsxC =c(4+rnorm(150), rnorm(150)-4)data_C =tibble(data_index =1:length(xC), data_value = xC)pC =ggplot(data_C, aes(x=data_value))+geom_histogram(color="black", fill="gray") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.ticks.x =element_blank(),axis.text.x =element_blank(),axis.title =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("C)")~"Mixed dataset"),y ="Count", x ="Data value")# panel D: outliersxD =rnorm(150)data_D =tibble(data_index=1:length(xD), data_value=xD)data_D[60, "data_value"] =10data_D[84, "data_value"] =14pD =ggplot(data_D, aes(x =1:nrow(data_D), y = data_value)) +geom_point(shape=22,size=5,fill ="gray",alpha=.8 ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.ticks.x =element_blank(),axis.text.x =element_blank(),axis.title =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("D)")~"Outliers"),y ="Data value", x ="Data index")p7.2= pA + pB + pC + pDp7.2```# Figure 7.4: Example of dataset with outliers## Python version:```{python}# Create normally distributed dataN =100data = np.random.randn(N)# and add two random outliers in random positionsdata[np.random.choice(np.arange(N),2)] = np.random.uniform(2,3,2)**2# and plotplt.figure(figsize=(8,4))plt.plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))plt.xlim([-2,N+1])plt.xlabel('Data index')plt.ylabel('Data value')plt.tight_layout()plt.show()```## R version:```{r}#| fig-width: 8#| fig-height: 4# Create normally distributed dataN =100y =rnorm(N)data =tibble(data_index =1:length(y), data_value = y)# and add two random outliers in random positiionsdata[sample(N, 2), "data_value"] =runif(2, min=2, max=3)^2# and plot:p7.4=ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray",alpha=.8 ) +xlim(-2, N+1) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(y ="Data value", x ="Data index")p7.4```# Figure 7.6: Z-score method for identifying outliers## Python version:```{python}# outlier thresholdzThreshold =3.09# create some raw dataN =135data = np.exp(np.random.randn(N)/2) +5# zscore the datadataZ = (data-np.mean(data)) / np.std(data,ddof=1)# identify data indices containing outliersoutliers = np.where(np.abs(dataZ)>zThreshold)[0]# and plot_,axs = plt.subplots(1,2,figsize=(10,4))axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))axs[0].set(xlim=[-2,N+1],xlabel='Data index',ylabel='Data value')axs[0].set_title(r'$\bf{A}$) Original data')axs[1].plot(dataZ,'ks',markersize=10,markerfacecolor=(.9,.9,.9))axs[1].axhline(zThreshold,linestyle='--',color=(.9,.9,.9))axs[1].plot(outliers,dataZ[outliers],'kx',markersize=10,markeredgewidth=2)axs[1].set(xlim=[-3,N+2],xlabel='Data index',ylabel='Transformed data value')axs[1].set_title(r'$\bf{B}$) Z-transformed data')plt.tight_layout()plt.show()```## R version:```{r}#| fig-width: 10#| fig-height: 4# outlier thresholdzThreshold =3.09# create some raw dataN =135data =exp(rnorm(N)/2) +5# zscore the data:dataZ = (data -mean(data))/sd(data) # sd uses n-1 dof# identify data indices containing outliersoutliers =abs(dataZ) > zThreshold# and plotdata =tibble(data_index =1:length(data), data_value = data)p1 =ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="#969696",alpha=.8 ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Original data"),y ="Data value", x ="Data index")dataZ =tibble(data_index =1:length(dataZ), data_value = dataZ)p2 =ggplot(dataZ, aes(x=data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray",alpha=.8 ) +geom_point(data = dataZ[outliers,],aes(x = data_index, y=data_value),shape=7,stroke=1,size=3,fill ="gray",alpha=.8 ) +geom_hline(yintercept = zThreshold, linetype="dashed", linewidth=.7,color="gray", ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Z-transformed data"),y ="Transformed data value", x ="Data index")p7.6= p1 + p2p7.6```# Figure 7.7: Impact of removing outliers on z-values## Python version:```{python}# create some raw dataN =10# sample sizedata = np.exp(np.random.randn(N)/2) +5data[-1] = np.max(data)+2# impose an outlier (at the end for convenience)xvals = np.arange(N)dataZ1 = (data-np.mean(data)) / np.std(data,ddof=1)dataZ2 = (data[:-1]-np.mean(data[:-1])) / np.std(data[:-1],ddof=1)_,axs = plt.subplots(1,2,figsize=(10,4))axs[0].plot(xvals,data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))axs[0].set(xticks=[],xlabel='Data index',ylabel='Raw data value')axs[0].set_title(r'$\bf{A}$) Raw data')axs[1].plot(xvals,dataZ1,'ks',markersize=10,markerfacecolor=(.7,.7,.7),label='Z with outlier')axs[1].plot(xvals[:-1],dataZ2,'ko',markersize=10,markerfacecolor=(.5,.5,.5),label='Z without outlier')axs[1].set(xticks=[],xlabel='Data index',ylabel='Transformed data value')axs[1].legend()axs[1].set_title(r'$\bf{B}$) Z-transformed data')# draw lines connection pre/post-removal valuesfor d,z,x inzip(dataZ1[:-1],dataZ2,xvals[:-1]): axs[1].plot([x,x],[d,z],':',color=(.7,.7,.7),zorder=-10)plt.tight_layout()plt.show()```## R version:```{r}#| fig-width: 10#| fig-height: 4# create some raw dataN =10# sample sizedata =exp(rnorm(N)/2) +5data[N] =max(data) +2# impose an outlier (at the end for convenience)dataZ1 = (data -mean(data)) /sd(data) # sd uses n-1 dofdataZ2 = (data[-N] -mean(data[-N])) /sd(data[-N]) # sd uses n-1 dof# and plot:data =tibble(data_index =1:length(data), data_value = data)pA =ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray" ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.text.x=element_blank(),axis.ticks.x=element_blank(),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Raw data"),y ="Raw data value", x ="Data index")dataZ1 =tibble(data_index =1:length(dataZ1), data_value = dataZ1, id="Z with outlier")dataZ2 =tibble(data_index =1:length(dataZ2), data_value = dataZ2, id="Z without outlier")dataZ =rows_append(dataZ1, dataZ2)pB =ggplot(dataZ, aes(x = data_index, y=data_value, group = data_index,shape=id,fill=id)) +geom_line(linetype="dashed", linewidth=.3 ) +geom_point(size=5 ) +scale_shape_manual(values=c("Z with outlier"=22,"Z without outlier"=21)) +scale_fill_manual(values=c("Z with outlier"="#bdbdbd","Z without outlier"="#525252")) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.text.x=element_blank(),axis.ticks.x=element_blank(),legend.position =c(.8,.9),legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(shape="", fill="",title =bquote(bold("B)")~"Z-transformed data"),y ="Transformed data value", x ="Data index")p7.7= pA + pBp7.7```# Figure 7.9: Data trimming## Python version:```{python}N =74data = np.random.randn(N)**3# find largest and smallest valuesk =2sortidx = np.argsort(data)minvals = sortidx[:k]maxvals = sortidx[-k:]_,axs = plt.subplots(2,1,figsize=(8,6))axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.9,.9,.9))axs[0].plot(minvals,data[minvals],'kx',markersize=10,markeredgewidth=2)axs[0].plot(maxvals,data[maxvals],'kx',markersize=10,markeredgewidth=2)axs[0].set_title(r'$\bf{A}$) Data with k-extreme points trimmed')# create a Gaussian probability curve for the panel Bx = np.linspace(-4,4,401)gpdf = stats.norm.pdf(x)# the find the indices of the 2.5% and 97.5%lbndi = np.argmin(np.abs(x-stats.norm.ppf(.025))) # lbndi = Lower BouND Indexubndi = np.argmin(np.abs(x-stats.norm.ppf(1-.025)))# plot the probability function and the vertical linesaxs[1].plot(x,gpdf,'k',linewidth=2)axs[1].axvline(x[lbndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')axs[1].axvline(x[ubndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')axs[1].set(xlim=x[[0,-1]],ylim=[0,.42])axs[1].set_title(r'$\bf{B}$) Histogram showing trimmed areas')# now create patches for the rejected areaaxs[1].fill_between(x[:lbndi+1],gpdf[:lbndi+1],color='k',alpha=.4)axs[1].fill_between(x[ubndi:],gpdf[ubndi:],color='k',alpha=.4)# and saveplt.tight_layout()plt.show()```## R Version:```{r}#| fig-width: 8#| fig-height: 6N =74y =rnorm(N)^3data =tibble(data_index=1:length(y), data_value=y)# find largest and smallest valuesk =2# sorting ascending order:data =arrange(data, data_value)largest =head(data, k)smallest =tail(data, k)pA =ggplot(data, aes(x = data_index, y = data_value)) +geom_point(shape=22, size=5,fill ="gray" ) +geom_point(data = largest,aes(x = data_index, y = data_value),shape=7,size=5,stroke=1,fill ="gray",alpha=.8 ) +geom_point(data = smallest,aes(x = data_index, y = data_value),shape=7,size=5,stroke=1,fill ="gray",alpha=.8 ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("A)")~"Data with k-extreme points trimmed"),y ="", x ="")# create a Gaussian probability curve for the panel Bx =seq(-4,4,length=401)gpdf =dnorm(x, mean =0, sd =1)data =tibble(data_index = x, data_value = gpdf)# find the indices of the 2.5% and 97.5%lbndi =which.min(abs(x -qnorm(0.025))) # lower bound indexubndi =which.min(abs(x -qnorm(0.975))) # upper bound index# plot the probability function and the vertical linespB =ggplot(data = data, aes(x = data_index, y = data_value)) +geom_line() +geom_vline(xintercept = x[lbndi], linetype="dashed") +geom_ribbon(data =filter(data, data_index <= x[lbndi]), aes(ymax = data_value, ymin=0),fill="#969696") +geom_vline(xintercept = x[ubndi], linetype="dashed") +geom_ribbon(data =filter(data, data_index >= x[ubndi]), aes(ymax = data_value, ymin=0),fill="#969696") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(title =bquote(bold("B)")~"Histogram showing trimmed areas"),y ="", x ="")p7.9= pA + pB +plot_layout(ncol=1)p7.9```# Exercise 1## Python version:```{python}## iterative method# Note about this code: Because of random numbers, you are not guaranteed to get a result# that highlights the method. Try running the code several times.N =30data = np.random.randn(N)data[data<-1] = data[data<-1]+2data[data>1.5] = data[data>1.5]**2;# try to force a few outliers# pick a lenient threshold just for illustrationzscorethresh =2dataZ = (data-np.mean(data)) / np.std(data,ddof=1)plt.figure(figsize=(10,4))colorz ='brkmc'numiters =0# iteration counterwhileTrue:# convert to z datamean = np.nanmean(dataZ) datastd = np.nanstd(dataZ,ddof=1) dataZ = (dataZ-datamean) / datastd# find data values to remove toremove = dataZ>zscorethresh# break out of while loop if no points to removeifsum(toremove)==0:breakelse:# otherwise, mark the outliers in the plot plt.plot(np.where(toremove)[0]+numiters/5,dataZ[toremove],'%sx'%colorz[numiters], markersize=12,markeredgewidth=3) dataZ[toremove] = np.nan# replot plt.plot(np.arange(N)+numiters/5,dataZ,linestyle='None',marker=f'${numiters}$',markersize=12, color=colorz[numiters])# update counter numiters = numiters +1plt.ylabel('Z-score')plt.xlabel('Data index')plt.tight_layout()plt.show()```## R version:Some notes: if I understand correctly, you want to simulate some data, add some outliers and calculate the z-scores and removing them to see the behavior of the dataset.In each iteration, you want to find all outliers, plot the all the datapoints, and mark the outliers to remove them in the next round.If that's the case, I take the liberty to reorganize the code and do the following:- Commented the part of the original `data` does not get z-scored twice in the first iteration;- Reorganized the `replot section` in the for-loop so that the data points are plotted even when no outlier is found. This will prevent the case of no-plotting when no outlier is found in the first iteration.Please, let me know if these changes make sense. In any case, I can follow strictly to the python code.```{r}#| fig-width: 10#| fig-height: 4## iterative method# Note about this code: Because of random numbers, you are not guaranteed to get a result# that highlights the method. Try running the code several times.# Simulate the data:N =30data =rnorm(N)data[data <-1] = data[data <-1] +2data[data >1.5] = data[ data >1.5]^2# pick a lenient threshold just for illustrationzscorethresh =2# ======================================================= # # if this part is included, then the first iteration will be # z-scored two times.# dataZ = (data-mean(data)) / sd(data)# color pallete:colorz =c("blue", "black", "red", "magenta", "cyan")# start the skeleton of the plot:p =ggplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(y ="Z-score", x ="Data index")# initiate counter:numiters =1# initiate the dataZ with original data:dataZ = datawhile(TRUE) {# convert to z-score: datamean =mean(dataZ, na.rm = T) datastd =sd(dataZ, na.rm = T) dataZ = (dataZ - datamean)/datastd# find data values to remoove: toRemove = dataZ > zscorethresh# plot points in the current iteration: nmb = numiters color_ = colorz[nmb] data_temp =tibble(data_index =1:N + numiters/5, data_value = dataZ) p <- p +geom_text(data = data_temp,aes(x = data_index, y = data_value),label = nmb,size =5,color = color_ )# break out of the while loop if no points to remove:if (sum(toRemove, na.rm = T) ==0) {break } else{# otherwise, mark the outliers in the plot for the current iteration: outlier_data =tibble(data_index =which(toRemove) + numiters/5,data_value = dataZ[toRemove] ) color_ = colorz[numiters] p <- p +geom_point(data = outlier_data,aes(x=data_index, y=data_value),shape =4,color = color_,size=2,stroke=1 ) dataZ[toRemove] =NaN }# update counter: numiters <- numiters +1}p```# Exercise 2## Python version```{python}# create dataN =10000Y = np.exp(np.sin(np.random.randn(N)))# make a copy of the data to manipulateYc = Y.copy()# not specified in the instructions, but always a good idea to inspect the data!plt.hist(Y,bins=40)plt.show()``````{python}# percent to remove (two-tailed)k =4# convert that to a number of data points to remove from each tailpnts2nan =int( (k/2)/100* N ) # with stated parameters, this should be 200# find the data sortingsort_idx = np.argsort(Y)# nan the two tails separatelyYc[sort_idx[:pnts2nan]] = np.nanYc[sort_idx[-pnts2nan:]] = np.nan# confirm the right numbers of pointsprint(f'Total dataset size: {len(Yc)}')print(f'Valid dataset size: {np.sum(~np.isnan(Yc))}')``````{python}# compute the mean and median (also used in the next exercise)meanY = np.mean(Y)medianY = np.median(Y)# print the meansprint(f'Mean of original: {meanY:.3f}')print(f'Mean of trimmed: {np.nanmean(Yc):.3f}')# print the mediansprint(' ')print(f'Median of original: {medianY:.3f}')print(f'Median of trimmed: {np.nanmedian(Yc):.3f}')```## R version:```{r}# create dataN =10000Y =exp(sin(rnorm(N)))# make a copy of the data to manipulate:data =tibble(data_index =1:length(Y), data_value = Y)ggplot(data, aes(x = data_value)) +geom_histogram(bins =40, fill="#2c7fb8", color="white") +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(y ="", x ="")``````{r}# percent to remove (two-tailed)k =4# finding threshold in the distribution:q = k/2# percent from each sidethresh =quantile(data$data_value, c(q/100, 1-q/100))# data with two-tailed points removed:data_twoTailed_rmvd = data %>%filter(data_value > thresh[1] & data_value < thresh[2])# confirm the right number of points:print(glue("Total dataset size: {nrow(data)}"))print(glue("Total dataset size: {nrow(data_twoTailed_rmvd)}"))``````{r}# compute the mean and median (also used in the next exercise)mean_Y =mean(data$data_value)median_Y =median(data$data_value)mean_Ytrimmed =mean(data_twoTailed_rmvd$data_value)median_Ytrimmed =median(data_twoTailed_rmvd$data_value)# print the meansprint(glue("Mean of original: {sprintf('%.3f', mean_Y)}"))print(glue("Mean of trimmed: {sprintf('%.3f', mean_Ytrimmed)}"))# print the mediansprint(glue("Median of original: {sprintf('%.3f', median_Y)}"))print(glue("Median of trimmed: {sprintf('%.3f', median_Ytrimmed)}"))```# Exercise 3## Python version:```{python}# the range of k valuesks = np.arange(1,50,3)# initialize a results matrix for mean/medianresults = np.zeros((len(ks),2))# the experiment!for idx,ki inenumerate(ks):# make a new copy of the original data Yc = Y.copy() # Note: Y was defined in Exercise 2# convert that to a number of data points to remove from each tail pnts2nan =int( (ki/2)/100* N )# nan the two tails separately Yc[sort_idx[:pnts2nan]] = np.nan Yc[sort_idx[-pnts2nan:]] = np.nan# collect mean and median results[idx,0] =100*(np.nanmean(Yc)-meanY) / meanY results[idx,1] =100*(np.nanmedian(Yc)-medianY) / medianYprint(f'Total/valid dataset size: {len(Yc)} -> {np.sum(~np.isnan(Yc))}')``````{python}# plot the resultsplt.figure(figsize=(8,4))plt.plot(ks,results[:,0],'s-',color=[.6,.6,.6],markerfacecolor=[.8,.8,.8],markersize=10,label='Mean')plt.plot(ks,results[:,1],'o-',color='k',markerfacecolor=[.4,.4,.4],markersize=10,label='Median')plt.legend()plt.xlabel('k% to trim')plt.ylabel(r'Descriptive value (%$\Delta$)')plt.tight_layout()plt.show()```## R version```{r}#| fig-width: 8#| fig-height: 4# the range of k valuesks =seq(1, 50, 3)# initialize a results matrix for mean/medianresults =matrix(0, length(ks), 2)# the experiment!for (idx inseq_along(ks)) {#idx = 1# convert that to a number of data points to remove from each tail: q = ks[idx]/2# get the thresholds: thresh =quantile(data$data_value, c(q/100, 1-q/100))# get the remaining data points: data_temp = data %>%filter(data_value > thresh[1] & data_value < thresh[2])# collect mean and median and calculate % difference from original ones: results[idx, 1] =100*(mean(data_temp$data_value) -mean(data$data_value)) /mean(data$data_value) results[idx, 2] =100*(median(data_temp$data_value) -median(data$data_value)) /median(data$data_value)print(glue("Total/valid dataset size: {nrow(data)} -> {nrow(data_temp)}"))}``````{r}#| fig-width: 8#| fig-height: 4ggplot(tibble(data_index = ks, mean = results[,1], median = results[,2])) +geom_point(aes(x = data_index, y = mean, color ="Mean",fill ="Mean",shape ="Mean" ),size=4 ) +geom_line(aes(x = data_index, y = mean, color ="Mean")) +geom_point(aes(x = data_index, y = median,color ="Median", fill ="Median", shape ="Median"),size=4 ) +geom_line(aes(x = data_index, y = median, color="Median")) +scale_shape_manual(values =c("Mean"=22,"Median"=21) ) +scale_color_manual(values =c("Mean"="#bdbdbd","Median"="#525252") ) +scale_fill_manual(values =c("Mean"="#bdbdbd","Median"="#525252") ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position =c(.1,.1),legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", y =expression(paste("Descriptive value (%",Delta,")")), x ="k% to trim")```# Exercise 4## Python version:```{python}# create dataN =1000X = np.random.f(5,100,size=N)# zscore dataXz = (X-np.mean(X)) / np.std(X,ddof=1)zThresh =3# clean dataXclean = X[Xz<zThresh]# report number of removed data pointsprint(f'Original sample size: {N}')print(f'Cleaned sample size: {len(Xclean)}')print(f'Percent data removed: {100*(1-len(Xclean)/N):.2f}%')``````{python}# histogram bins using FD ruley_fd_all = np.histogram(X,bins='fd')y_fd_clean = np.histogram(Xclean,bins='fd')# histogram bins using set boundaries# Note that I create the bin boundaries using k+1 numbers, then input that vector of boundaries into np.histogramedges = np.linspace(np.min(X),np.max(X),41)y_40_all = np.histogram(X,bins=edges)y_40_clean = np.histogram(Xclean,bins=edges)# plotting the histograms_,axs = plt.subplots(2,1,figsize=(8,7))axs[0].plot((y_fd_all[1][:-1]+y_fd_all[1][1:])/2,y_fd_all[0],'ks-', label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))axs[0].plot((y_fd_clean[1][:-1]+y_fd_clean[1][1:])/2,y_fd_clean[0],'ko-', label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))axs[0].set_title(r'$\bf{A}$) Histograms using F-D rule')axs[1].plot((y_40_all[1][:-1]+y_40_all[1][1:])/2,y_40_all[0],'ks-', label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))axs[1].plot((y_40_clean[1][:-1]+y_40_clean[1][1:])/2,y_40_clean[0],'ko-', label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))axs[1].set_title(r'$\bf{B}$) Histograms using 40 bins')# axis adjustmentsfor a in axs: a.legend() a.set(xlabel='F value',ylabel='Count', xlim=[np.min(X)-.02,np.max(X)+.02],xticks=range(6))plt.tight_layout()plt.savefig('dataQC_ex4.png')plt.show()```## R Version```{r}#| fig-width: 8#| fig-height: 7N =1000x =rf(df1 =5, df2 =100, n = N)# zscore dataxZ = (x -mean(x)) /sd(x) # sd uses n-1 dofzThresh =3# clean dataxClean = x[xZ < zThresh]# report number of removed data points:print(glue("Original sample size: {N}"))print(glue("Cleaned sample size: {length(xClean)}"))print(glue("Percent data removed: {sprintf('%.2f', 100*(1 - length(xClean)/N))}%"))# histogram bins using FD ruleedges_fd_all =seq(min(x), max(x), length=nclass.FD(x))y_fd_all =hist(x, breaks = edges_fd_all, plot=F)edges_fd_clean =seq(min(xClean), max(xClean), length=nclass.FD(xClean))y_fd_clean =hist(xClean, breaks = edges_fd_clean, plot=F)# function to creating dataframeconvert_to_DF =function(y){ df_ =tibble(data_index = y$mids,data_value = y$counts )return (df_)}y_fd_all =convert_to_DF(y_fd_all)y_fd_clean =convert_to_DF(y_fd_clean)# histogram bins using set boundaries# Note that I create the bin boundaries using k+1 numbers, then input that # vector of boundaries into np.histogramedges =seq(min(x), max(x), length=41)y_40_all =hist(x, breaks = edges, plot=F)y_40_clean =hist(xClean, breaks = edges, plot=F)# convert to DF:y_40_all =convert_to_DF(y_40_all)y_40_clean =convert_to_DF(y_40_clean)# plotting the histograms:pA =ggplot() +geom_line(data = y_fd_all,aes(x = data_index, y = data_value,color ="Pre-cleaned", ),size = .5 ) +geom_point(data = y_fd_all,aes(x = data_index, y = data_value,color ="Pre-cleaned", fill ="Pre-cleaned",shape ="Pre-cleaned" ),size =5,stroke = .2 ) +geom_line(data = y_fd_clean,aes(x = data_index, y = data_value,color ="Cleaned" ),size =0.5 ) +geom_point(data = y_fd_clean,aes(x = data_index, y = data_value,color ="Cleaned",fill ="Cleaned",shape ="Cleaned" ),size =4,stroke = .5 ) +scale_shape_manual(name ="",values =c("Pre-cleaned"=22,"Cleaned"=21 ) ) +scale_color_manual(name ="",values =c("Cleaned"="#252525","Pre-cleaned"="#252525" ) ) +scale_fill_manual(name ="",values =c("Cleaned"="#bdbdbd","Pre-cleaned"="#525252" ) ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position =c(.8,.9),legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title =bquote(bold("A)")~"Histogram using F-D rule"),y ="Count", x ="F value")pB =ggplot() +geom_line(data = y_40_all,aes(x = data_index, y = data_value,color ="Pre-cleaned", ),size = .5 ) +geom_point(data = y_40_all,aes(x = data_index, y = data_value,color ="Pre-cleaned", fill ="Pre-cleaned",shape ="Pre-cleaned" ),size =5,stroke = .2 ) +geom_line(data = y_40_clean,aes(x = data_index, y = data_value,color ="Cleaned" ),size =0.5 ) +geom_point(data = y_40_clean,aes(x = data_index, y = data_value,color ="Cleaned",fill ="Cleaned",shape ="Cleaned" ),size =4,stroke = .5 ) +scale_shape_manual(name ="",values =c("Pre-cleaned"=22,"Cleaned"=21 ) ) +scale_color_manual(name ="",values =c("Cleaned"="#252525","Pre-cleaned"="#252525" ) ) +scale_fill_manual(name ="",values =c("Cleaned"="#bdbdbd","Pre-cleaned"="#525252" ) ) +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position =c(.8,.9),legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title =bquote(bold("B)")~"Histogram using 40 bins"),y ="Count", x ="F value")pExercise4 = pA + pB +plot_layout(ncol=1)pExercise4```# Exercise 5## Python version:```{python}# url reference: https://archive.ics.uci.edu/ml/datasets/Arrhythmia# import datadf = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data', usecols = np.arange(9), names = ['age','sex','height','weight','qrs','p-r','q-t','t','p'])# inspectdf.head()``````{python}# boxplots of raw dataplt.figure(figsize=(10,5))sns.boxplot(data=df).set(xlabel='Data feature',ylabel='Data value')plt.show()``````{python}# make a copy of the original data matrixdf_z = df.copy()for col in df_z.columns:ifnot (col=='sex'): df_z[col] = (df[col] - df[col].mean()) / df[col].std(ddof=1)# inspect againdf_z``````{python}# box plots of z-scored dataplt.figure(figsize=(10,5))sns.boxplot(data=df_z).set(xlabel='Data feature',ylabel='Data value')plt.show()``````{python}# Note: this cell combines the previous graphs to make one figure for the book_,axs = plt.subplots(2,1,figsize=(10,7))sns.boxplot(data=df, ax=axs[0]).set(xticks=[],ylabel='Data value')axs[0].set_title(r'$\bf{A}$) Raw data')sns.boxplot(data=df_z,ax=axs[1]).set(xlabel='Data feature',ylabel='Transformed data value')axs[1].set_title(r'$\bf{B}$) Z-transformed data')plt.tight_layout()plt.show()```## R version```{r}#| message: false#| warning: false# import datalibrary(readr)data <-read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data',col_select =1:9,col_names =c('age','sex','height','weight','qrs','p-r','q-t','t','p'))# inspecthead(data, 5)``````{r}#| fig-width: 10#| fig-height: 5ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title ="",y ="Data value", x ="Data features")``````{r}# scale the data:cols_ =setdiff(names(data), "sex") # all columns, but 'sex' featuredataZ = data %>%mutate(across(all_of(cols_), function(x) (x -mean(x))/sd(x)))# inspecting againhead(dataZ)``````{r}#| fig-width: 10#| fig-height: 5# box plots of z-scoredggplot(stack(dataZ), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm") ,axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title ="",y ="Data value", x ="Data features")``````{r}#| fig-width: 10#| fig-height: 7# Note: this cell combines the previous graphs to make one figure for the bookp_boxPlot_all =ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title =bquote(bold("A)")~"Raw data"),y ="Data value", x ="")p_boxPlot_all# box plots of z-scoredp_boxPlot_z =ggplot(stack(dataZ), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), axis.line =element_line(colour ="black"),legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title =bquote(bold("B)")~"Z-transformed data"),y ="Data value", x ="Data features")p_boxPlot = p_boxPlot_all + p_boxPlot_z +plot_layout(ncol =1)p_boxPlot```# Exercise 6## Python version:```{python}# remove based on z-score thresholdzThresh =3.09# p<.001df_clean = df.copy()df_clean[df_z>zThresh] = np.nan # positive taildf_clean[df_z<-zThresh] = np.nan # negative tail``````{python}# plot_,axs = plt.subplots(2,1,figsize=(10,7))sns.boxplot(data=df,ax=axs[0]).set(xticks=[],ylabel='Data value')axs[0].set_title(r'$\bf{A}$) Raw data')sns.boxplot(data=df_clean,ax=axs[1]).set(xlabel='Data feature',ylabel='Data value')axs[1].set_title(r'$\bf{B}$) Cleaned data')plt.tight_layout()plt.savefig('dataQC_ex6a.png')plt.show()``````{python}# print the meansraw_means = df.mean().valuescleaned_means = df_clean.mean().valuesfor name,pre,post inzip(df.columns,raw_means,cleaned_means):print(f'{name:>6}: {pre:6.2f} -> {post:6.2f}')``````{python}# compute percent changepctchange =100*(cleaned_means-raw_means) / raw_means# and plotplt.figure(figsize=(9,4))plt.plot(pctchange,'ks',markersize=14,markerfacecolor=(.7,.7,.7))plt.axhline(0,color='k',linewidth=2,zorder=-1)plt.xticks(range(9),labels=df.columns)plt.ylabel('Percent')plt.title('Change in feature means after z-score data rejection',loc='center')plt.grid()plt.tight_layout()plt.savefig('dataQC_ex6b.png')plt.show()```## R version :```{r}# remove based on z-score thresholdzThresh =3.09# p<.001# creating copy:data_clean = data# loop over all columnsfor (col_ incolnames(dataZ)) { data_clean[[col_]][dataZ[[col_]] > zThresh] =NA data_clean[[col_]][dataZ[[col_]] <-zThresh] =NA}``````{r}#| fig-width: 10#| fig-height: 7pA =ggplot(stack(data), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title ="",y ="Data value", x ="")pB =ggplot(stack(data_clean), aes(x = ind, y = values, fill=ind)) +geom_boxplot() +theme(panel.grid.major =element_blank(), panel.grid.minor =element_blank(),panel.background =element_blank(), legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15) ) +labs(fill="", color ="", shape ="", title ="",y ="Data value", x ="Data features")pA + pB +plot_layout(ncol=1)``````{r}# print the meansraw_means = data %>%summarize(across(everything(), mean))clean_means = data_clean %>%summarize(across(everything(), ~mean(.x, na.rm=T)))for (col_ incolnames(data)) {print(glue('{str_pad(col_, 6, "left", " ")}: {sprintf("%6.2f", raw_means[[col_]])} -> {sprintf("%6.2f", clean_means[[col_]])}'))}``````{r}#| fig-width: 9#| fig-height: 4# compute percent changepctchange =100*(clean_means - raw_means) / raw_meansggplot(data=stack(pctchange), aes(x = ind, y = values)) +geom_hline(yintercept =0,color="black" ) +geom_point(shape =22,fill ="#969696",size=6 ) +theme_bw() +theme(legend.position ="none",legend.title =element_blank(),legend.box.background =element_rect(color="gray", linewidth =1),legend.text =element_text(size =12),legend.key =element_blank(),legend.key.width =unit(1, "cm"),axis.line =element_line(colour ="black"),axis.title =element_text(size=15),axis.text =element_text(size=15),title =element_text(size=15),plot.title =element_text(hjust = .5) ) +labs(title ="Change in feature means after z-score data rejection",y ="Percent", x ="")```